package com.ndood.core.util;

import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.lang.StringEscapeUtils;

import us.codecraft.webmagic.selector.Html;

/**
 * String helpers that clean up an escaped/mangled HTML string and wrap the
 * result in a WebMagic {@link Html} object.
 */
public class HtmlStrUtil {

	/**
	 * Matches a quoted attribute name of the form {@code " name"="}.
	 * NOTE(review): the range {@code A-z} also covers the ASCII characters
	 * between 'Z' and 'a' ({@code [ \ ] ^ _} and the backtick). It is kept
	 * as-is to preserve existing behaviour; confirm whether {@code A-Za-z}
	 * was intended.
	 */
	private static final String QUOTED_ATTR_NAME_REGEX = "\"\\s([0-9A-z]+)\"=\"";

	/** Matches every character that is stripped from a quoted attribute name match. */
	private static final String NON_ATTR_NAME_CHARS_REGEX = "[^0-9A-z]";

	/**
	 * Strips escaping noise (backslashes, {@code &quot;} entities, stray quotes)
	 * from the given string and builds a new {@link Html} page from the result.
	 *
	 * <p>Steps, in order:
	 * <ol>
	 *   <li>remove every backslash;</li>
	 *   <li>remove every literal {@code &quot;};</li>
	 *   <li>rewrite each {@code " name"="} sequence to {@code  name } (name
	 *       surrounded by single spaces);</li>
	 *   <li>remove every {@code "=""} sequence and rewrite {@code /">} to
	 *       {@code "/>};</li>
	 *   <li>unescape the remaining HTML entities.</li>
	 * </ol>
	 *
	 * <p>NOTE(review): the input is presumably HTML that was embedded in an
	 * escaped (e.g. JSON) string — confirm against callers.
	 *
	 * @param string the raw string to clean; must not be {@code null}
	 * @return an {@link Html} built from the cleaned string
	 * @throws NullPointerException if {@code string} is {@code null}
	 */
	public static Html translate(String string) {
		// Remove all backslashes (literal replacement, no regex needed).
		String cleaned = string.replace("\\", "");

		// Remove all &quot; entities.
		cleaned = cleaned.replace("&quot;", "");

		// " xxx"="  ->  xxx
		cleaned = groupReplace(cleaned, QUOTED_ATTR_NAME_REGEX, NON_ATTR_NAME_CHARS_REGEX);

		// Drop leftover meaningless fragments and repair self-closing tag endings.
		cleaned = cleaned.replace("\"=\"\"", "");
		cleaned = cleaned.replace("/\">", "\"/>");

		// Convert the remaining HTML entities back to their characters.
		cleaned = StringEscapeUtils.unescapeHtml(cleaned);

		return new Html(cleaned);
	}

	/**
	 * Replaces every match of {@code reg} in {@code src} with the matched text
	 * after all matches of {@code delReg} have been removed from it, surrounded
	 * by one space on each side.
	 *
	 * <p>The replacement is performed in a single left-to-right pass, so each
	 * match is replaced exactly once and in place, even when the replacement
	 * text would itself match {@code reg}. The replacement text is inserted
	 * literally: {@code $} and {@code \} in the matched text have no special
	 * meaning.
	 *
	 * @param src    the string to process; must not be {@code null}
	 * @param reg    regular expression selecting the spans to replace
	 * @param delReg regular expression whose matches are deleted from each
	 *               selected span before it is re-inserted
	 * @return the processed string; equal to {@code src} when {@code reg}
	 *         matches nothing
	 * @throws java.util.regex.PatternSyntaxException if {@code reg} is invalid,
	 *         or if {@code delReg} is invalid and at least one match is found
	 */
	public static String groupReplace(String src, String reg, String delReg) {
		Matcher matcher = Pattern.compile(reg).matcher(src);
		// StringBuffer (not StringBuilder) keeps this compatible with pre-Java 9 runtimes.
		StringBuffer result = new StringBuffer();
		while (matcher.find()) {
			String stripped = matcher.group().replaceAll(delReg, "");
			// quoteReplacement: the stripped text must be inserted verbatim,
			// not interpreted as a replacement template.
			matcher.appendReplacement(result, Matcher.quoteReplacement(" " + stripped + " "));
		}
		matcher.appendTail(result);
		return result.toString();
	}

}
